#import necessary Python libraries
import pandas as pd
import folium
import datetime
import numpy as np
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
# Load the yearly crime extracts (one CSV file per year, 2020-2022).
# Data source on Kaggle: https://www.kaggle.com/datasets/onlyrohit/crimes-in-chicago
df1 = pd.read_csv('2020.csv')
df2 = pd.read_csv('2021.csv')
df3 = pd.read_csv('2022.csv')

# Population of each community area and the region it belongs to, taken from
# https://en.wikipedia.org/wiki/Community_areas_in_Chicago
# NOTE(review): the file is named .csv but is read with read_excel --
# presumably it is really an Excel workbook; confirm before switching readers.
pop = pd.read_excel('chicagopop.csv')

# Stack the years row-wise. df1 is rebound to 2020+2021 first, then 2022 is
# appended to build the combined frame df.
df1 = pd.concat([df1, df2], axis=0)

# Columns that are dropped from the combined frame before any further work.
unused_columns = [
    'Case Number', 'IUCR', 'Description', 'Beat', "District", "Ward",
    "FBI Code", "X Coordinate", "Y Coordinate", "Updated On", 'Location',
]

# Remove the unused columns, then discard every row that still has a missing value.
df = pd.concat([df1, df3], axis=0).drop(columns=unused_columns).dropna()

pop.head(3)  # preview the population table
| | Name | Population | Area |
|---|---|---|---|
| 0 | Rogers Park | 55628 | Far North Side |
| 1 | West Ridge | 77122 | Far North Side |
| 2 | Uptown | 57182 | Far North Side |
# Preview the first three rows of the combined crime frame (after the column drop and dropna).
df.head(3)
| | ID | Date | Block | Primary Type | Location Description | Arrest | Domestic | Community Area | Year | Latitude | Longitude |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12016034 | 01/01/2020 12:00:00 AM | 018XX N WINNEBAGO AVE | DECEPTIVE PRACTICE | APARTMENT | False | False | 22.0 | 2020 | 41.915306 | -87.686639 |
| 1 | 12220321 | 01/01/2020 12:00:00 AM | 091XX S DREXEL AVE | OFFENSE INVOLVING CHILDREN | RESIDENCE | False | True | 47.0 | 2020 | 41.728192 | -87.600985 |
| 2 | 12013828 | 01/01/2020 12:00:00 AM | 044XX S LAVERGNE AVE | CRIMINAL SEXUAL ASSAULT | APARTMENT | False | False | 56.0 | 2020 | 41.812274 | -87.748177 |
# Give every row of pop its community area number: the 0-based row position
# plus 1.
# NOTE(review): this assumes the rows of pop are listed in community-area
# order (area 1 first) -- confirm against the source table.
pop = pop.reset_index().rename(columns={'index': 'Community Area'})
pop['Community Area'] += 1

# Kept on purpose: this rebinds the module-level name `datetime` from the
# module (imported at the top of the file) to the class, and code further
# down the file may rely on that binding.
from datetime import datetime

# Parse the 'Date' strings (e.g. '01/01/2020 12:00:00 AM') once, vectorized,
# instead of calling datetime.strptime twice per row through apply().
parsed_dates = pd.to_datetime(df['Date'], format='%m/%d/%Y %I:%M:%S %p')
df['Hour'] = parsed_dates.dt.hour    # hour of day, 0-23
df['Month'] = parsed_dates.dt.month  # month of year, 1-12

# Attach Name / Population / Area to every crime row. The inner join drops
# crime rows whose 'Community Area' has no matching row in pop.
df = pd.merge(df, pop, on='Community Area', how='inner')
df.head(3)
| | ID | Date | Block | Primary Type | Location Description | Arrest | Domestic | Community Area | Year | Latitude | Longitude | Hour | Month | Name | Population | Area |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 12016034 | 01/01/2020 12:00:00 AM | 018XX N WINNEBAGO AVE | DECEPTIVE PRACTICE | APARTMENT | False | False | 22.0 | 2020 | 41.915306 | -87.686639 | 0 | 1 | Logan Square | 71665 | North Side |
| 1 | 11950278 | 01/01/2020 12:01:00 AM | 038XX W FULLERTON AVE | OTHER OFFENSE | RESIDENCE | False | True | 22.0 | 2020 | 41.924536 | -87.722407 | 0 | 1 | Logan Square | 71665 | North Side |
| 2 | 11939212 | 01/01/2020 01:20:00 AM | 020XX N CALIFORNIA AVE | CRIMINAL DAMAGE | CONVENIENCE STORE | True | False | 22.0 | 2020 | 41.917654 | -87.697205 | 1 | 1 | Logan Square | 71665 | North Side |
# Heat map of crime locations, drawn with folium.
import folium
from folium.plugins import HeatMap
from folium.plugins import MarkerCluster

# Map centre used for both maps below (latitude, longitude of Chicago).
chicago_center = [41.881, -87.623]

# Plain base map; not used by the heat map below.
m = folium.Map(chicago_center, zoom_start=14)

# Despite the name, this takes the coordinates of every row in df (no filter
# on year is applied). The name is kept because later code may refer to it.
latlon_2022 = df[['Latitude', 'Longitude']]

# "Stamen Toner" is no longer a built-in tileset in recent folium releases
# (the bare name is rejected with "Custom tiles must have an attribution"),
# so a built-in, attribution-included basemap is used instead.
crime_heatmap = folium.Map(
    location=chicago_center,
    tiles="CartoDB positron",
    zoom_start=12,
)

# One heat point per crime row; a low minimum opacity keeps sparse areas faint.
HeatMap(latlon_2022, min_opacity=0.05).add_to(crime_heatmap)
crime_heatmap